Explore correlations¶
Correlations among year, loan_grade, purpose
Plots presented in this section show distinct differences in the distribution of loan grades for different years and loan purposes, but no dramatic patterns.
The distribution of loan purposes varies only slightly as a function of year.
Correlations involving loan_amnt
The largest loans, in the range of \$36k to \$40k, have much better loan grades than smaller loans.
Loans in the range of \$5k to \$11k have somewhat better grades than other loans less than \$36k.
The year 2018 shows a distinct change in the distribution of loan amounts. Loan amounts that are multiples of \$5k become more frequent, as do loans of \$36k and above.
Correlations involving term
The loan term is strongly correlated with loan grade, tending to increase with poorer loan grade.
Almost all loans below \$10k have a term of 36 months. Loan terms tend to increase with increasing loan amount up to around \$25k, and the mean loan term for larger loans is a little below 50 months.
The distribution of loan term shows distinct variations as a function of year and loan purpose, but no strong patterns.
This notebook presents initial exploration of correlations involving selected features in the loan data.
Later notebooks present in-depth analysis of particular features, e.g., int_rate,
including correlations involving those features.
import re
import numpy as np
import pandas as pd
import plotly.express as px
from IPython.display import display
import notebook_tools.database as db
from notebook_tools.derived_features import get_year
from notebook_tools.feature_exploration import (
get_group_sizes,
get_value_counts,
style_value_counts,
)
# Fetch the loan table and its metadata through the project database helper.
loan_data = db.get_loan_data()
loan_metadata = db.get_loan_metadata()

# Add the two columns used for grouping throughout this notebook:
#   year - derived from the issue date column by the project helper `get_year`
#   term - the stored term value rendered as a label such as "36 months"
loan_data = loan_data.assign(
    year=get_year(loan_data, "issue_d"),
    term=loan_data["term"].map(lambda months: f"{months} months"),
)
Correlations among year, loan_grade, purpose¶
Distributions of individual features¶
# Tabulate the number of loans per year and render the styled table.
year_counts = get_value_counts(loan_data["year"])
display(style_value_counts(year_counts))
| count | |
|---|---|
| year | |
| 2018 | 495,163 |
| 2017 | 443,447 |
| 2016 | 434,254 |
| 2015 | 420,954 |
| 2014 | 235,551 |
| 2013 | 134,786 |
| 2012 | 53,352 |
# Number of loans in every (grade, sub_grade) combination, shown as a
# one-column frame named "count".
loan_data.groupby(["grade", "sub_grade"]).size().to_frame("count")
| count | ||
|---|---|---|
| grade | sub_grade | |
| A | A1 | 85648 |
| A2 | 68042 | |
| A3 | 71361 | |
| A4 | 92969 | |
| A5 | 104824 | |
| B | B1 | 123437 |
| B2 | 124476 | |
| B3 | 128488 | |
| B4 | 137166 | |
| B5 | 137455 | |
| C | C1 | 143589 |
| C2 | 128908 | |
| C3 | 127480 | |
| C4 | 125707 | |
| C5 | 115390 | |
| D | D1 | 80713 |
| D2 | 71379 | |
| D3 | 63469 | |
| D4 | 55718 | |
| D5 | 46984 | |
| E | E1 | 32673 |
| E2 | 29122 | |
| E3 | 26026 | |
| E4 | 22199 | |
| E5 | 22162 | |
| F | F1 | 13016 |
| F2 | 8987 | |
| F3 | 7553 | |
| F4 | 5909 | |
| F5 | 5009 | |
| G | G1 | 3963 |
| G2 | 2579 | |
| G3 | 2013 | |
| G4 | 1612 | |
| G5 | 1481 |
# Tabulate the number of loans per stated purpose and render the styled table.
purpose_counts = get_value_counts(loan_data["purpose"])
display(style_value_counts(purpose_counts))
| count | |
|---|---|
| purpose | |
| debt_consolidation | 1,257,717 |
| credit_card | 511,384 |
| home_improvement | 147,222 |
| other | 134,972 |
| major_purchase | 48,121 |
| medical | 26,732 |
| small_business | 22,693 |
| car | 22,389 |
| vacation | 15,120 |
| moving | 14,760 |
| house | 13,705 |
| wedding | 1,351 |
| renewable_energy | 1,339 |
| educational | 2 |
# Line chart of the number of loans per issue date.
to_plot = get_group_sizes(loan_data, group_by="issue_d")
fig = px.line(
    data_frame=to_plot,
    x="issue_d",
    y="count",
    title="Number of loans by date",
    labels={"issue_d": "Loan date", "count": "Number of loans"},
    # ":.3s" is a d3 format: SI prefix with three significant digits.
    hover_data={"count": ":.3s"},
    markers=True,
)
fig.show()
# Bar chart with one bar per year giving the number of loans.
to_plot = get_group_sizes(loan_data, group_by="year")
fig = px.bar(
    data_frame=to_plot,
    x="year",
    y="count",
    title="Number of loans by year",
    labels={"year": "Year", "count": "Number of loans"},
    # ":.3s" is a d3 format: SI prefix with three significant digits.
    hover_data={"count": ":.3s"},
)
fig.show()
# Bar chart with one bar per loan grade giving the number of loans.
to_plot = get_group_sizes(loan_data, group_by="grade")
fig = px.bar(
    data_frame=to_plot,
    x="grade",
    y="count",
    title="Number of loans by loan grade",
    labels={"grade": "Loan grade", "count": "Number of loans"},
    # ":.3s" is a d3 format: SI prefix with three significant digits.
    hover_data={"count": ":.3s"},
)
fig.show()
# Bar per loan grade, split by colour into its sub-grades.
to_plot = get_group_sizes(loan_data, group_by=["grade", "sub_grade"])

# Keep only the second character of each sub-grade ("B3" -> "3") so that the
# colour legend is shared by all grades instead of listing every sub-grade.
to_plot["sub_grade"] = to_plot["sub_grade"].str.get(1)

fig = px.bar(
    data_frame=to_plot,
    x="grade",
    y="count",
    color="sub_grade",
    title="Number of loans by loan grade and sub-grade",
    labels={
        "grade": "Loan grade",
        "count": "Number of loans",
        "sub_grade": "Sub-grade",
    },
    hover_data={"count": ":.3s"},
)
fig.show()
# Number of loans per purpose, most frequent purpose first.
to_plot = get_group_sizes(loan_data, group_by="purpose")
to_plot = to_plot.sort_values(by="count", ascending=False)

# Remember the purposes in descending order of frequency; later plots pass
# this list as a category order so that purposes always appear in this order.
ordered_loan_purposes = list(to_plot["purpose"])

fig = px.bar(
    data_frame=to_plot,
    x="purpose",
    y="count",
    title="Number of loans by purpose",
    labels={"purpose": "Loan purpose", "count": "Number of loans"},
    hover_data={"count": ":.3s"},
)
fig.show()
Correlations between features¶
# For each year, the fraction of loans falling into each grade.
to_plot = get_group_sizes(loan_data, group_by=["year", "grade"])
fig = px.histogram(
    data_frame=to_plot,
    x="year",
    y="count",
    color="grade",
    barnorm="fraction",  # normalise every year's bar to a total of 1
    title="Distribution of loan grade by year",
    labels={"year": "Year", "count": "Number of loans", "grade": "Grade"},
    height=400,
)


def clean_up_hovertemplate(trace):
    """Show the normalised value as a percentage with a short label on hover."""
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=".2p", title_text="Percentage of loans")
fig.show()
For a simpler view of correlations involving the loan grade, map the grades to numbers and calculate the mean grade for different groups.
# Numeric encoding of the letter grades: A -> 7, B -> 6, ..., G -> 1.
loan_grade_mapper = dict(zip("ABCDEFG", range(7, 0, -1)))

# Mean encoded grade of the loans issued in each year.
to_plot = loan_data[["year", "grade"]]
to_plot = to_plot.assign(grade=to_plot["grade"].map(loan_grade_mapper))
to_plot = to_plot.groupby("year").mean().reset_index()

fig = px.bar(
    data_frame=to_plot,
    x="year",
    y="grade",
    title="Mean numeric grades by year (A=7, B=6, C=5, ..., G=1)",
    labels={"year": "Year", "grade": "Mean numeric grade"},
    hover_data={"grade": ":.2f"},
)
fig.show()
# For each year, the fraction of loans taken out for each purpose.
to_plot = get_group_sizes(loan_data, group_by=["year", "purpose"])
fig = px.histogram(
    data_frame=to_plot,
    x="year",
    y="count",
    color="purpose",
    barnorm="fraction",  # normalise every year's bar to a total of 1
    category_orders={"purpose": ordered_loan_purposes},
    title="Distribution of loan purpose by year",
    labels={"year": "Year", "count": "Number of loans", "purpose": "Purpose"},
    height=500,
)


def clean_up_hovertemplate(trace):
    """Show the normalised value as a percentage with a short label on hover."""
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=".2p", title_text="Percentage of loans")
fig.show()
# For each loan purpose, the fraction of loans falling into each grade.
to_plot = get_group_sizes(loan_data, group_by=["purpose", "grade"])
fig = px.histogram(
    data_frame=to_plot,
    x="purpose",
    y="count",
    color="grade",
    barnorm="fraction",  # normalise every purpose's bar to a total of 1
    category_orders={"purpose": ordered_loan_purposes},
    title="Distribution of loan grade by loan purpose",
    labels={"purpose": "Loan purpose", "count": "Number of loans", "grade": "Grade"},
    height=400,
)


def clean_up_hovertemplate(trace):
    """Show the normalised value as a percentage with a short label on hover."""
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
fig.show()
# Mean encoded grade (A=7 ... G=1) of the loans taken out for each purpose.
to_plot = loan_data[["purpose", "grade"]]
to_plot = to_plot.assign(grade=to_plot["grade"].map(loan_grade_mapper))
to_plot = to_plot.groupby("purpose").mean().reset_index()

fig = px.bar(
    data_frame=to_plot,
    x="purpose",
    y="grade",
    category_orders={"purpose": ordered_loan_purposes},
    title="Mean numeric grades by loan purpose (A=7, B=6, C=5, ..., G=1)",
    labels={"purpose": "Loan purpose", "grade": "Mean numeric grade"},
    hover_data={"grade": ":.2f"},
)
fig.show()
Conclusions:
- The distribution of loan grades shows distinct variation for different years and loan purposes, but no dramatic patterns.
- The distribution of loan purposes varies only slightly as a function of year.
Correlations involving loan_amnt¶
For large data sets, the binning of histogram data should be done outside of plotly. The reason is that plotly does binning in JavaScript, and so unbinned data passed to plotly's histogram function becomes part of the JavaScript code stored with the notebook. For the current data set, this can increase the notebook size on disk by a factor of more than 100.
# Report the smallest and largest loan amount in the data.
# The results are stored under descriptive names rather than `min` / `max`:
# rebinding those names would shadow the Python builtins for every cell that
# runs afterwards, making any later `min(...)` / `max(...)` call fail.
loan_amnt_min = loan_data["loan_amnt"].min()
loan_amnt_max = loan_data["loan_amnt"].max()
print(
    'The minimum and maximum values of "loan_amnt" '
    f"are ${loan_amnt_min:,} and ${loan_amnt_max:,}, respectively."
)
The minimum and maximum values of "loan_amnt" are $1,000.0 and $40,000.0, respectively.
# Bin edges, bin labels and axis ticks shared by the loan-amount plots.
# NOTE(review): the original comment said to use "$ in place of $" to avoid
# triggering math formatting, which suggests the labels once used an escaped
# dollar sign that was decoded in this rendering - confirm against the
# notebook source before changing the label strings.

# 41 edges from $1k to $41k, i.e. 40 bins that are each $1k wide.
loan_amnt_bins = np.linspace(1e3, 41e3, num=41)

# One label per bin, written as a half-open interval in thousands of dollars.
loan_amnt_bin_labels = [f"[${lo:d}k - ${lo + 1:d}k)" for lo in range(1, 41)]

# Put an axis tick on every fifth bin, starting with the bin that begins at
# $5k, and label it with that bin's left edge only.
loan_amnt_tick_vals = loan_amnt_bin_labels[4::5]
loan_amnt_tick_text = [f"${edge:d}k" for edge in range(5, 45, 5)]

# Assign every loan to its bin. right=False makes the bins left-closed, which
# matches the "[a - b)" labels.
loan_data["loan_amnt_bin"] = pd.cut(
    loan_data["loan_amnt"],
    bins=loan_amnt_bins,
    right=False,
    labels=loan_amnt_bin_labels,
)
# Number of loans in each pre-computed $1k loan-amount bin.
to_plot = get_group_sizes(loan_data, group_by="loan_amnt_bin")
fig = px.bar(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    title="Distribution of loan amount",
    labels={"count": "Number of loans", "loan_amnt_bin": "Loan amount"},
)

# Hover text: the bin label (supplied through customdata) and the count in SI
# notation; "<extra></extra>" suppresses the secondary hover box.
hovertemplate = "Loan amount=%{customdata}<br>Number of loans=%{y:.3s}<extra></extra>"
(
    fig.update_traces(customdata=loan_amnt_bin_labels, hovertemplate=hovertemplate)
    .update_layout(bargap=0)  # adjacent bars touch, as in a histogram
    .update_xaxes(
        tickmode="array",
        tickvals=loan_amnt_tick_vals,
        ticktext=loan_amnt_tick_text,
    )
    .show()
)
# For each loan-amount bin, the fraction of loans falling into each grade.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "grade"])
fig = px.histogram(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    color="grade",
    barnorm="fraction",  # normalise every bin's bar to a total of 1
    title="Distribution of loan grade by loan amount",
    labels={
        "loan_amnt_bin": "Loan amount",
        "count": "Number of loans",
        "grade": "Grade",
    },
)


def clean_up_hovertemplate(trace):
    """Show the bin label and a percentage value in the hover text."""
    template = trace.hovertemplate
    for old, new in (
        ("%{x}", "%{customdata}"),
        ("%{y}", "%{y:.3p}"),
        ("sum of Number of loans (normalized as fraction)", "Percentage"),
    ):
        template = template.replace(old, new)
    trace.customdata = loan_amnt_bin_labels
    trace.hovertemplate = template


(
    fig.for_each_trace(clean_up_hovertemplate)
    .update_layout(bargap=0)  # adjacent bars touch, as in a histogram
    .update_xaxes(
        tickmode="array",
        tickvals=loan_amnt_tick_vals,
        ticktext=loan_amnt_tick_text,
    )
    .update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
    .show()
)
# Mean encoded grade (A=7 ... G=1) of the loans in each loan-amount bin.
# observed=False is passed through to groupby exactly as in the original.
to_plot = loan_data[["loan_amnt_bin", "grade"]]
to_plot = to_plot.assign(grade=to_plot["grade"].map(loan_grade_mapper))
to_plot = to_plot.groupby("loan_amnt_bin", observed=False).mean().reset_index()

fig = px.bar(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="grade",
    title="Mean numeric grades by loan amount (A=7, B=6, C=5, ..., G=1)",
    labels={
        "loan_amnt_bin": "Loan amount",
        "grade": "Mean numeric grade",
    },
)

# Hover text: the bin label (supplied through customdata) and the mean grade
# with two decimals; "<extra></extra>" suppresses the secondary hover box.
hovertemplate = (
    "Loan amount=%{customdata}<br>Mean numeric grade=%{y:.2f}<extra></extra>"
)
(
    fig.update_traces(customdata=loan_amnt_bin_labels, hovertemplate=hovertemplate)
    .update_layout(bargap=0)  # adjacent bars touch, as in a histogram
    .update_xaxes(
        tickmode="array",
        tickvals=loan_amnt_tick_vals,
        ticktext=loan_amnt_tick_text,
    )
    .show()
)
# For each loan-amount bin, the fraction of loans issued in each year.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "year"])
fig = px.histogram(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    color="year",
    barnorm="fraction",  # normalise every bin's bar to a total of 1
    title="Distribution of loan year by loan amount",
    labels={
        "loan_amnt_bin": "Loan amount",
        "count": "Number of loans",
        "year": "Year",
    },
)


def clean_up_hovertemplate(trace):
    """Show the bin label and a percentage value in the hover text."""
    template = trace.hovertemplate
    for old, new in (
        ("%{x}", "%{customdata}"),
        ("%{y}", "%{y:.3p}"),
        ("sum of Number of loans (normalized as fraction)", "Percentage"),
    ):
        template = template.replace(old, new)
    trace.customdata = loan_amnt_bin_labels
    trace.hovertemplate = template


(
    fig.for_each_trace(clean_up_hovertemplate)
    .update_layout(bargap=0)  # adjacent bars touch, as in a histogram
    .update_xaxes(
        tickmode="array",
        tickvals=loan_amnt_tick_vals,
        ticktext=loan_amnt_tick_text,
    )
    .update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
    .show()
)
# Ordinal encoding of the issue year so that a mean "year" can be computed
# per loan-amount bin: 2012 -> 1, 2013 -> 2, ..., 2018 -> 7.
# NOTE(review): the keys are strings, so this assumes the `year` column holds
# strings - confirm against `get_year`; non-matching values would map to NaN.
encoded_year_mapper = {
    "2012": 1,
    "2013": 2,
    "2014": 3,
    "2015": 4,
    "2016": 5,
    "2017": 6,
    "2018": 7,
}

# Mean encoded year of the loans in each loan-amount bin.
to_plot = (
    loan_data[["loan_amnt_bin", "year"]]
    .assign(year=lambda df: df["year"].map(encoded_year_mapper))
    .groupby("loan_amnt_bin", observed=False)
    .mean()
    .reset_index()
)
fig = px.bar(
    to_plot,
    x="loan_amnt_bin",
    y="year",
    labels={
        "loan_amnt_bin": "Loan amount",
        "year": "Mean encoded year",
    },
    # The legend in the title must agree with encoded_year_mapper: the value 5
    # encodes 2016 (the previous title wrongly said "2015=5").
    title="Mean encoded year by loan amount (2018=7, 2017=6, 2016=5, ..., 2012=1)",
)

# Hover text: the bin label (supplied through customdata) and the mean encoded
# year with two decimals; "<extra></extra>" suppresses the secondary hover box.
hovertemplate = "Loan amount=%{customdata}<br>Mean encoded year=%{y:.2f}<extra></extra>"
fig.update_traces(customdata=loan_amnt_bin_labels, hovertemplate=hovertemplate)
fig.update_layout(bargap=0)  # adjacent bars touch, as in a histogram
fig.update_xaxes(
    tickmode="array", tickvals=loan_amnt_tick_vals, ticktext=loan_amnt_tick_text
)
fig.show()
# For each loan-amount bin, the fraction of loans taken out for each purpose.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "purpose"])
fig = px.histogram(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    color="purpose",
    barnorm="fraction",  # normalise every bin's bar to a total of 1
    category_orders={"purpose": ordered_loan_purposes},
    title="Distribution of loan purpose by loan amount",
    labels={
        "loan_amnt_bin": "Loan amount",
        "count": "Number of loans",
        "purpose": "Purpose",
    },
    height=500,
)


def clean_up_hovertemplate(trace):
    """Show the bin label and a percentage value in the hover text."""
    template = trace.hovertemplate
    for old, new in (
        ("%{x}", "%{customdata}"),
        ("%{y}", "%{y:.3p}"),
        ("sum of Number of loans (normalized as fraction)", "Percentage"),
    ):
        template = template.replace(old, new)
    trace.customdata = loan_amnt_bin_labels
    trace.hovertemplate = template


(
    fig.for_each_trace(clean_up_hovertemplate)
    .update_layout(bargap=0)  # adjacent bars touch, as in a histogram
    .update_xaxes(
        tickmode="array",
        tickvals=loan_amnt_tick_vals,
        ticktext=loan_amnt_tick_text,
    )
    .update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
    .show()
)
Conclusions:
- The largest loans, in the range of \$36k to \$40k, have much better loan grades than smaller loans.
- Loans in the range of \$5k to \$11k have somewhat better grades than other loans less than \$36k.
- The year 2018 shows a distinct change in the distribution of loan amounts. Loan amounts that are multiples of \$5k become more frequent, as do loans of \$36k and above.
Correlations involving term¶
# Tabulate the number of loans per loan term and render the styled table.
term_counts = get_value_counts(loan_data["term"])
display(style_value_counts(term_counts))
| count | |
|---|---|
| term | |
| 36 months | 1,577,886 |
| 60 months | 639,621 |
# Bar chart with one bar per loan term giving the number of loans.
to_plot = get_group_sizes(loan_data, group_by="term")
fig = px.bar(
    data_frame=to_plot,
    x="term",
    y="count",
    title="Number of loans by loan term",
    labels={"term": "Loan term", "count": "Number of loans"},
    # ":.3s" is a d3 format: SI prefix with three significant digits.
    hover_data={"count": ":.3s"},
)
fig.show()
# For each loan grade, the fraction of loans with each loan term.
to_plot = get_group_sizes(loan_data, group_by=["grade", "term"])
fig = px.histogram(
    data_frame=to_plot,
    x="grade",
    y="count",
    color="term",
    barnorm="fraction",  # normalise every grade's bar to a total of 1
    title="Distribution of loan term by grade",
    labels={"grade": "Grade", "count": "Number of loans", "term": "Loan term"},
)


def clean_up_hovertemplate(trace):
    """Show the normalised value as a percentage with a short label on hover."""
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=".2p", title_text="Percentage of loans")
fig.show()
# Mean loan term of the loans in each grade. The term labels are turned back
# into integers by removing the "months" text and surrounding whitespace.
to_plot = loan_data[["grade", "term"]]
to_plot = to_plot.assign(
    term=to_plot["term"].str.replace("months", "").str.strip().astype("Int64")
)
to_plot = to_plot.groupby("grade").mean().reset_index()

fig = px.bar(
    data_frame=to_plot,
    x="grade",
    y="term",
    title="Mean loan term by grade",
    labels={"grade": "Grade", "term": "Mean loan term"},
    hover_data={"term": ":.1f"},
)


def clean_up_hovertemplate(trace):
    """Append the unit " months" after the formatted y value in the hover text."""
    trace.hovertemplate = re.sub(
        pattern=r"(%{y.*?})", repl=r"\1 months", string=trace.hovertemplate
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Mean loan term (months)")
fig.show()
# For each year, the fraction of loans with each loan term.
to_plot = get_group_sizes(loan_data, group_by=["year", "term"])
fig = px.histogram(
    data_frame=to_plot,
    x="year",
    y="count",
    color="term",
    barnorm="fraction",  # normalise every year's bar to a total of 1
    title="Distribution of loan term by year",
    labels={"year": "Year", "count": "Number of loans", "term": "Loan term"},
)


def clean_up_hovertemplate(trace):
    """Show the normalised value as a percentage with a short label on hover."""
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=".2p", title_text="Percentage of loans")
fig.show()
# Mean loan term of the loans issued in each year. The term labels are turned
# back into integers by removing the "months" text and surrounding whitespace.
to_plot = loan_data[["year", "term"]]
to_plot = to_plot.assign(
    term=to_plot["term"].str.replace("months", "").str.strip().astype("Int64")
)
to_plot = to_plot.groupby("year").mean().reset_index()

fig = px.bar(
    data_frame=to_plot,
    x="year",
    y="term",
    title="Mean loan term by year",
    labels={"year": "Year", "term": "Mean loan term"},
    hover_data={"term": ":.1f"},
)


def clean_up_hovertemplate(trace):
    """Append the unit " months" after the formatted y value in the hover text."""
    trace.hovertemplate = re.sub(
        pattern=r"(%{y.*?})", repl=r"\1 months", string=trace.hovertemplate
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Mean loan term (months)")
fig.show()
# For each loan purpose, the fraction of loans with each loan term.
to_plot = get_group_sizes(loan_data, group_by=["purpose", "term"])
fig = px.histogram(
    data_frame=to_plot,
    x="purpose",
    y="count",
    color="term",
    barnorm="fraction",  # normalise every purpose's bar to a total of 1
    category_orders={"purpose": ordered_loan_purposes},
    title="Distribution of loan term by loan purpose",
    labels={"purpose": "Loan purpose", "count": "Number of loans", "term": "Loan term"},
)


def clean_up_hovertemplate(trace):
    """Show the normalised value as a percentage with a short label on hover."""
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=".2p", title_text="Percentage of loans")
fig.show()
# Mean loan term of the loans taken out for each purpose. The term labels are
# turned back into integers by removing the "months" text and whitespace.
to_plot = loan_data[["purpose", "term"]]
to_plot = to_plot.assign(
    term=to_plot["term"].str.replace("months", "").str.strip().astype("Int64")
)
to_plot = to_plot.groupby("purpose").mean().reset_index()

fig = px.bar(
    data_frame=to_plot,
    x="purpose",
    y="term",
    category_orders={"purpose": ordered_loan_purposes},
    title="Mean loan term by loan purpose",
    labels={"purpose": "Loan purpose", "term": "Mean loan term"},
    hover_data={"term": ":.1f"},
)


def clean_up_hovertemplate(trace):
    """Append the unit " months" after the formatted y value in the hover text."""
    trace.hovertemplate = re.sub(
        pattern=r"(%{y.*?})", repl=r"\1 months", string=trace.hovertemplate
    )


fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(title_text="Mean loan term (months)")
fig.show()
# For each loan-amount bin, the fraction of loans with each loan term.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "term"])
fig = px.histogram(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    color="term",
    barnorm="fraction",  # normalise every bin's bar to a total of 1
    title="Distribution of loan term by loan amount",
    labels={
        "loan_amnt_bin": "Loan amount",
        "count": "Number of loans",
        "term": "Loan term",
    },
)


def clean_up_hovertemplate(trace):
    """Show the bin label and a percentage value in the hover text."""
    template = trace.hovertemplate
    for old, new in (
        ("%{x}", "%{customdata}"),
        ("%{y}", "%{y:.3p}"),
        ("sum of Number of loans (normalized as fraction)", "Percentage"),
    ):
        template = template.replace(old, new)
    trace.customdata = loan_amnt_bin_labels
    trace.hovertemplate = template


(
    fig.for_each_trace(clean_up_hovertemplate)
    .update_layout(bargap=0)  # adjacent bars touch, as in a histogram
    .update_xaxes(
        tickmode="array",
        tickvals=loan_amnt_tick_vals,
        ticktext=loan_amnt_tick_text,
    )
    .update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
    .show()
)
# Mean loan term of the loans in each loan-amount bin. The term labels are
# turned back into integers by removing the "months" text and whitespace.
to_plot = loan_data[["loan_amnt_bin", "term"]]
to_plot = to_plot.assign(
    term=to_plot["term"].str.replace("months", "").str.strip().astype("Int64")
)
to_plot = to_plot.groupby("loan_amnt_bin", observed=False).mean().reset_index()

fig = px.bar(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="term",
    title="Mean loan term by loan amount",
    labels={
        "loan_amnt_bin": "Loan amount",
        "term": "Mean loan term",
    },
    hover_data={"term": ":.1f"},
)


def clean_up_hovertemplate(trace):
    """Append the unit " months" after the formatted y value in the hover text."""
    trace.hovertemplate = re.sub(
        pattern=r"(%{y.*?})", repl=r"\1 months", string=trace.hovertemplate
    )


(
    fig.for_each_trace(clean_up_hovertemplate)
    .update_layout(bargap=0)  # adjacent bars touch, as in a histogram
    .update_xaxes(
        tickmode="array",
        tickvals=loan_amnt_tick_vals,
        ticktext=loan_amnt_tick_text,
    )
    .show()
)
Conclusions:
- The loan term is strongly correlated with loan grade, tending to increase with poorer loan grade.
- Almost all loans below \$10k have a term of 36 months. Loan terms tend to increase with increasing loan amount up to around \$25k, and the mean loan term for larger loans is a little below 50 months.
- The distribution of loan term shows distinct variations as a function of year and loan purpose, but no strong patterns.